Let's begin with the imports...
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud
from utils import load_corpus, split_train_test, getSceneData
%load_ext autoreload
%autoreload 2
Load the corpus into a DataFrame, and create a separate DataFrame for the 4 main characters.
# Load the full corpus, then keep a restricted view with only the four
# leads for the per-character analyses below.
df = load_corpus()
MAIN_CHARACTERS = ["JERRY", "ELAINE", "KRAMER", "GEORGE"]
df_main_characters = df[df["character"].isin(MAIN_CHARACTERS)]
Let's see what each entry in our DataFrame looks like...
# Peek at the first few rows to see which columns are available.
df.head(3)
Split for each character and for funny/not-funny sentences.
Are funny sentences longer than not-funny ones?
def plot_per_character_cross_is_funny(df, colname):
    """Plot the distribution of *colname* in a character x is_funny grid.

    One violin plot per (character, is_funny) cell, with shared axes so the
    distributions are directly comparable across cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus with 'character' and 'is_funny' columns plus *colname*.
    colname : str
        Name of the numeric column whose distribution is plotted.
    """
    with sns.plotting_context("notebook", font_scale=3.5):
        g = sns.FacetGrid(df, col='character', row='is_funny', height=12, sharex=True, sharey=True)
        # FIX: the original passed bins=50, a histogram argument that
        # violinplot does not accept; it has been dropped.
        g.map(sns.violinplot, colname, color='b')
    plt.show()

plot_per_character_cross_is_funny(df_main_characters, "num_words")  # Sentence's length in #words
We can see that the distributions are slightly different between characters, and between funny and not-funny.
It seems that funny sentences have "bigger tail".
However, it's not obvious that this is significant.
Let's try to see it numerically...
# Per-(character, is_funny) summary statistics of sentence length in words.
df_main_characters.groupby(["character", "is_funny"]).describe()['num_words']
Let's try to see the distribution of the rate of the speech (#word per second).
We'll take Elaine as an example, as the other characters are about the same.
# Speech rate: words per second of the utterance.
df['rate'] = df['num_words'] / df['length']
# Compare funny vs. not-funny rate distributions for Elaine only.
g = sns.FacetGrid(df[df['character'] == 'ELAINE'], col='is_funny', height=5, sharex=True, sharey=True, xlim=(0,8))
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# if the environment is upgraded, switch to sns.histplot(stat="density").
g.map(sns.distplot, "rate", bins=np.arange(0, 15, 0.25), color='b', kde=False, norm_hist=True)
Our conclusion from these visualizations is to add more features to each sentence, such as #words, length (in seconds), speech-rate (words per second), etc.
We saw that the results actually improved when we gave them these additional features.
In order to get meaningful words we want to remove very frequent English words
First we split by character and see its word-cloud.
We try to see the patterns of words chosen by each character.
Are there words that characterize each one?
from gensim.parsing import remove_stopwords

# One word-cloud per main character, arranged on a 2x2 grid.
plt.figure(figsize=(30, 15))
for cell, name in enumerate(["JERRY", "ELAINE", "KRAMER", "GEORGE"], start=1):
    # Collapse the character's lines into one lowercase string, strip
    # apostrophes, and drop common English stop-words.
    character_df = df[df['character'] == name]
    character_text = " ".join(character_df.txt).lower()
    character_text = remove_stopwords(character_text.replace('\'', ''))
    cloud = WordCloud(max_font_size=50, max_words=50, background_color="white").generate(character_text)
    plt.subplot(2, 2, cell)
    plt.title(name, {'fontsize' : 22})
    plt.axis("off")
    plt.imshow(cloud, interpolation="bilinear")
plt.show()
We can see that the most common words (bigger words in the word-cloud) are common between the characters.
Indeed, they are general English words that everyone uses, such as 'know', 'yeah', 'right', etc...
Let's try to see if we can see different patterns in the word-clouds of funny/not-funny sentences.
How are we going to do that?
We will filter out words that are not inductive for funny/not funny.
For each word we will calculate the frequency with which the character uses the word in funny/not-funny sentences, and then look only at words that pass a certain threshold (a parameter of the "algorithm").
def plot_wordcloud_only_freq(df, character, threshold=0.7):
    """Plot side-by-side word-clouds of a character's funny / not-funny words.

    For every word in the character's whole vocabulary we compute the
    fraction of its occurrences that fall in funny (resp. not-funny)
    sentences, and keep only words whose fraction exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Corpus with 'character', 'is_funny' and 'txt' columns.
    character : str
        Character name as it appears in the corpus (e.g. "JERRY").
    threshold : float
        Minimal occurrence fraction for a word to be kept in a cloud.
    """
    character_df = df[df['character'] == character]
    character_funny_text = " ".join(character_df[character_df['is_funny'] == True].txt).lower()
    character_not_funny_text = " ".join(character_df[character_df['is_funny'] == False].txt).lower()
    # Total count of each word over ALL of this character's sentences.
    vec = CountVectorizer().fit(character_df.txt)
    sum_words = vec.transform(character_df.txt).sum(axis=0)
    words_freq = {word : sum_words[0, idx] for word, idx in vec.vocabulary_.items()}
    # Fraction of each word's occurrences that happen in funny sentences.
    # (FIX: the original also fitted separate CountVectorizers on the funny
    # and not-funny subsets but never used them; that dead work is removed.)
    funny_sum_words = vec.transform(character_df[character_df['is_funny'] == True].txt).sum(axis=0)
    funny_words_freq = {word : funny_sum_words[0, idx] / words_freq[word] for word, idx in vec.vocabulary_.items()}
    freq_funny_words = {w for w, f in funny_words_freq.items() if f > threshold}
    character_funny_text = ' '.join(word for word in character_funny_text.split(' ') if word in freq_funny_words)
    # Same computation for the not-funny side.
    non_funny_sum_words = vec.transform(character_df[character_df['is_funny'] == False].txt).sum(axis=0)
    non_funny_words_freq = {word : non_funny_sum_words[0, idx] / words_freq[word] for word, idx in vec.vocabulary_.items()}
    freq_non_funny_words = {w for w, f in non_funny_words_freq.items() if f > threshold}
    character_not_funny_text = ' '.join(word for word in character_not_funny_text.split(' ') if word in freq_non_funny_words)
    # Render the two filtered texts side by side.
    wordcloud_funny = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(character_funny_text)
    wordcloud_not_funny = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(character_not_funny_text)
    plt.figure(figsize=(30, 30))
    plt.subplot(121)
    plt.title("%s - Funny" %(character), {'fontsize' : 20})
    plt.axis("off")
    plt.imshow(wordcloud_funny, interpolation="bilinear")
    plt.subplot(122)
    plt.title("%s - Not Funny" %(character),{'fontsize' : 20})
    plt.axis("off")
    plt.imshow(wordcloud_not_funny, interpolation="bilinear")
    plt.show()
# Render funny / not-funny clouds for each of the four leads, with a
# looser 0.5 occurrence-ratio threshold.
for character in ("JERRY", "ELAINE", "KRAMER", "GEORGE"):
    plot_wordcloud_only_freq(df, character, 0.5)
These word-clouds are just fun to look at...
We can see that there are many common frequent words between funny/not-funny sentences.
However, we can see some interesting issues for example we can see that:
When Jerry talks about dating, or Elaine tells someone to shut up, it's usually funny (as we can imagine and see from the data)
While when any of them say: "think", "really", "gonna" etc... those are regular words that we don't expect to see in funny sentences
We want to understand the interaction between the different characters.
Of course all of us saw Seinfeld, but we want to visualize the connections!
We will draw a network graph, which is based on the scenes.
Each node is a character (from the nature of our data it's only characters that talked), where an edge between s1 and s2 means that s1 and s2 were together in the same scene. (i.e. if we have n characters in a scene we will have (n choose 2) edges from this scene).
We give weights to the edges according to the number of times the two characters appeared in a scene together.
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): this looks like a leftover scratch cell — it re-imports
# CountVectorizer and refits `vec` on `character_df`, which at this point
# is whatever the preceding word-cloud loop left behind (its last
# iteration). Neither result is used below; consider deleting this cell.
vec = CountVectorizer()
vec.fit(character_df.txt)
import networkx as nx
import itertools
from bokeh.io import show, output_notebook, reset_output #output_file,
from bokeh.plotting import figure
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
from bokeh.models import HoverTool,BoxZoomTool, ResetTool, TapTool, Plot, Range1d, Circle, MultiLine
from bokeh.models.sources import ColumnDataSource
from bokeh.palettes import Spectral4
# Clear any previous Bokeh output state, then route plots to the notebook.
reset_output()
output_notebook()
We don't want the graph to be crowded, so we will filter out characters that don't appear often.
# Count how many lines each character has across the corpus.
# (Variable name "charcter_apperance" kept as-is — later cells reference it.)
charcter_apperance = df.groupby(['character']).size()
plt.hist(charcter_apperance, bins=np.arange(0,150))
plt.xlim(0,150)
# FIX: typos in the displayed title ("charcters"/"apperances").
plt.title("Amount of characters according to number of appearances")
plt.show()
As always, there is no clear cut, we can see that until ~40, there are a lot of characters that we didn't see much.
After 40, the histogram is more smooth
So we will make the cut at 40
# Keep only characters with more than `min_apperance` lines.
# (Variable names kept as-is — later cells reference them.)
min_apperance = 40
freq_apperance = charcter_apperance[charcter_apperance > min_apperance]
# FIX: the printed message read "we filter our {} charcters"; also count the
# filtered characters directly instead of len(...) minus the kept count.
print("we filter out {} characters".format((charcter_apperance <= min_apperance).sum()))
# Rebuild scene-level data from the filtered corpus.
df_scene = getSceneData(df[df.character.isin(freq_apperance.index)].reset_index())
df_scene.head(1)
# Keep the scene-level columns we need.
# (FIX: the original first filtered df_scene by freq_apperance and then
# immediately overwrote that result, discarding the filter. The filter was
# redundant anyway — df_scene was built from the already-filtered df.)
community_df = df_scene[['scene_number_in_episode', 'global_scene_number', 'scene_characters', 'is_funny', 'season']]
# A scene is funny iff at least one of its sentences is funny.
# (FIX: uses an index-aligned groupby-transform instead of the original
# flattened-list construction, which silently assumed the rows were sorted
# by global_scene_number.)
funny_scene = community_df.groupby('global_scene_number')['is_funny'].transform('any')
community_df.insert(loc=0, column='is_scene_funny', value=funny_scene)
community_df.head(1)
# Build one edge record per unordered character pair per multi-character
# scene: a scene with n characters contributes C(n, 2) records.
graph = []
for idx, row in community_df.iterrows():
    if len(row['scene_characters']) > 1:
        # Hoist the per-scene attributes out of the inner pair loop.
        scene_attrs = row.drop('scene_characters').to_dict()
        # (FIX: dropped a redundant list() around combinations.)
        for comb in itertools.combinations(row['scene_characters'], 2):
            # sort by abc so always same order
            char_1, char_2 = sorted(comb)
            graph.append({'char_1' : char_1, 'char_2' : char_2, **scene_attrs})
df_graph = pd.DataFrame(graph)
# Edge weight = number of shared scenes per pair.
# (FIX: one groupby replaces the original drop_duplicates / set_index /
# assign / reset_index sequence; same columns and values.)
weight_graph = df_graph.groupby(['char_1', 'char_2']).size().reset_index(name='weight')
weight_graph.head()
Although we filtered non-frequent characters, the graph was still too crowded, so we used another parameter, `weight_threshold`, and removed edges with weight smaller than the threshold.
# Drop weak edges (pairs sharing few scenes) to de-clutter the graph.
weight_threshold = 50
selected_weight_graph = weight_graph[weight_graph.weight > weight_threshold]
# Undirected graph: nodes are characters, edge attrs keep names + weight
# so the Bokeh hover tool below can display them.
g = nx.from_pandas_edgelist(selected_weight_graph, source='char_1', target='char_2',
                            edge_attr=['char_1', 'char_2', 'weight'])
# Interactive Bokeh rendering of the co-appearance graph.
# (FIX: title typo — was "senfiled charcters scenes".)
# NOTE(review): plot_width/plot_height were renamed width/height in Bokeh 3 —
# confirm the installed version if this cell stops working.
plot = figure(title="Seinfeld characters scenes", x_range=(-2.1,2.1),
              y_range=(-2.1,2.1), tools="pan", plot_width=700, plot_height=400)
# Force-directed (spring) layout; from_networkx accepts the layout callable.
pos = nx.spring_layout
graph = from_networkx(g, pos, scale=2, center=(0,0))
# Change node size porotional to degree: (log degree)^2 so hubs stand out.
# NOTE(review): a degree-1 node gets log(1)**2 == 0, i.e. invisible —
# confirm no such node survives the weight filtering above.
node_size = {k : np.log(v) ** 2 for k,v in g.degree()}
nx.set_node_attributes(g, node_size, 'node_size')
# Expose node attributes (node_size) to the renderer as a ColumnDataSource.
source=ColumnDataSource(pd.DataFrame.from_dict({k:v for k,v in g.nodes(data=True)},orient='index'))
graph.node_renderer.data_source = source
# Distinct glyph colors for normal / selected / hovered nodes and edges.
graph.node_renderer.glyph = Circle(size='node_size', fill_color=Spectral4[0])
graph.node_renderer.selection_glyph = Circle(size='node_size', fill_color=Spectral4[2])
graph.node_renderer.hover_glyph = Circle(size='node_size', fill_color=Spectral4[1])
graph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=5)
graph.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=5)
graph.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=5)
# set edge width proportional to the weight (scaled down to pixel range);
# this overrides the fixed line_width=5 set just above.
edge_weight = np.array([g.get_edge_data(a,b)['weight'] for a, b in g.edges()])
edge_weight = edge_weight / 1000
graph.edge_renderer.data_source.data["line_width"] = edge_weight
graph.edge_renderer.glyph.line_width = {'field': 'line_width'}
# Add tooltip for edges with: both nodes & the weight
graph.inspection_policy = EdgesAndLinkedNodes()
edge_hover_tool = HoverTool(tooltips=[("char_1", "@char_1"), ("char_2", "@char_2"),
                                      ("#scence", "@weight")])
plot.add_tools(edge_hover_tool, BoxZoomTool(), ResetTool())
plot.renderers.append(graph)
show(plot)
Please note that the edge widths and node sizes are proportional to the weight, so a character that appears in more scenes has a larger vertex. We can see that Elaine, George and Jerry appear in the highest number of scenes. We can also see that the "clique" edges between the four main characters (Elaine, George, Jerry and Kramer) have the largest width, as expected.
Another interesting insights:
Newman can be seen mostly with Jerry or Kramer
Estelle (George's mother) mostly talks with George, and after that with Jerry